Análise Exploratória: loan_data

Estudo de modelagem de classificação

Code
# Análise exploratória de dados: 

# Conjunto de dados: datasets/loan_data.csv
## https://www.kaggle.com/datasets/saramah/loan-data/data

# Pacotes e conjunto de dados ---------------------------------------------

library(tidyverse)
library(enoqueR)
library(gt)

# Read the loan dataset from the CSV file into `dados`.
# Uses `<-` for assignment, matching the other assignments in this script.
dados <- read_csv("datasets/loan_data.csv")

1 Visão geral

Code
# Stack the outputs of enoqueR::overall_info() and enoqueR::overall_tipos()
# row-wise and display the result as an interactive gt table.
list(
  enoqueR::overall_info(dados),
  enoqueR::overall_tipos(dados)
) %>%
  bind_rows() %>%
  gt() %>%
  opt_interactive()
Code
# Show the first rows of the dataset (head() default) as an interactive gt
# table, applying fmt_number() to the numeric columns only.
head(dados) %>%
  gt() %>%
  fmt_number(columns = where(is.numeric)) %>%
  opt_interactive()
Code
# Lookup of column names: each element's name is the new (Portuguese) column
# name and its value is the original column name in the CSV.
variaveis_pt <- c(
  politica_credito             = "credit.policy",
  proposito_emp                = "purpose",
  taxa_juros                   = "int.rate",
  parcela_mensal               = "installment",
  log_renda_anual              = "log.annual.inc",
  indice_divida_renda          = "dti",
  pontuacao_fico               = "fico",
  dias_com_credito             = "days.with.cr.line",
  saldo_rotativo               = "revol.bal",
  taxa_utilizacao_rotativa     = "revol.util",
  consultas_ultimos_6_meses    = "inq.last.6mths",
  atrasos_ultimos_2_anos       = "delinq.2yrs",
  registros_publicos_negativos = "pub.rec"
)
Code
# Rename the columns using `variaveis_pt` (new name = old name), translate the
# loan purpose categories to Portuguese and convert the credit policy
# indicator into a factor with levels "Não" and "Sim".
dados <- dados %>%
  rename(all_of(variaveis_pt)) %>%
  mutate(
    proposito_emp = case_when(
      proposito_emp == "credit_card" ~ "Cartão de credito",
      proposito_emp == "debt_consolidation" ~ "Consolidação de divida",
      proposito_emp == "educational" ~ "Educacional",
      # "home_improvement" had no translation and leaked into the tables as-is.
      proposito_emp == "home_improvement" ~ "Reforma residencial",
      proposito_emp == "major_purchase" ~ "Compra grande",
      proposito_emp == "small_business" ~ "Pequeno Negócio",
      proposito_emp == "all_other" ~ "Outro",
      # Any category not listed above is kept unchanged.
      .default = proposito_emp
    ),
    # 1 becomes "Sim", other non-missing values become "Não"; the level order
    # is fixed explicitly as "Não", "Sim".
    politica_credito = fct(
      if_else(politica_credito == 1, "Sim", "Não"),
      levels = c("Não", "Sim")
    )
  )

2 Análise descritiva

Code
# Descriptive statistics table for every column, produced by
# enoqueR::tbl_resumo().
enoqueR::tbl_resumo(dados)
Estatísticas Descritivas
Variável N (%) Média (DP) Mediana (IQR) Min - Max
taxa_juros - 0.1 (0.0) 0.1 (0.1, 0.1) 0.1 - 0.2
parcela_mensal - 319.1 (207.1) 269.0 (163.8, 432.9) 15.7 - 940.1
log_renda_anual - 10.9 (0.6) 10.9 (10.6, 11.3) 7.5 - 14.5
indice_divida_renda - 12.6 (6.9) 12.7 (7.2, 18.0) 0.0 - 30.0
pontuacao_fico - 710.8 (38.0) 707.0 (682.0, 737.0) 612.0 - 827.0
dias_com_credito - 4,560.8 (2,496.9) 4,140.0 (2,820.0, 5,730.0) 179.0 - 17,640.0
saldo_rotativo - 16,914.0 (33,756.2) 8,596.0 (3,187.0, 18,252.0) 0.0 - 1,207,359.0
taxa_utilizacao_rotativa - 46.8 (29.0) 46.3 (22.6, 70.9) 0.0 - 119.0
consultas_ultimos_6_meses - 1.6 (2.2) 1.0 (0.0, 2.0) 0.0 - 33.0
atrasos_ultimos_2_anos - 0.2 (0.5) 0.0 (0.0, 0.0) 0.0 - 13.0
registros_publicos_negativos - - - -
    0 - 9,019.0 (94.2%) 9,019.0 (94.2%) 9,019.0 (94.2%)
    1 - 533.0 (5.6%) 533.0 (5.6%) 533.0 (5.6%)
    2 - 19.0 (0.2%) 19.0 (0.2%) 19.0 (0.2%)
    3 - 5.0 (0.1%) 5.0 (0.1%) 5.0 (0.1%)
    4 - 1.0 (0.0%) 1.0 (0.0%) 1.0 (0.0%)
    5 - 1.0 (0.0%) 1.0 (0.0%) 1.0 (0.0%)
not.fully.paid - 1,533.0 (16.0%) 1,533.0 (16.0%) 1,533.0 (16.0%)
politica_credito - - - -
    Não 1,868.0 (19.5%) - - -
    Sim 7,710.0 (80.5%) - - -
proposito_emp - - - -
    Cartão de credito 1,262.0 (13.2%) - - -
    Compra grande 437.0 (4.6%) - - -
    Consolidação de divida 3,957.0 (41.3%) - - -
    Educacional 343.0 (3.6%) - - -
    home_improvement 629.0 (6.6%) - - -
    Outro 2,331.0 (24.3%) - - -
    Pequeno Negócio 619.0 (6.5%) - - -
Code
# Render the output of enoqueR::variaveis() as a gt table with default number
# formatting; the column labels receive the custom CSS declared below.
enoqueR::variaveis(dados) %>%
  gt() %>%
  fmt_number() %>%
  tab_style(
    locations = cells_column_labels(),
    style = "vertical-align:middle; font-weight: bold"
  )
name atrasos_ultimos_2_anos consultas_ultimos_6_meses dias_com_credito indice_divida_renda log_renda_anual not.fully.paid parcela_mensal pontuacao_fico registros_publicos_negativos saldo_rotativo taxa_juros taxa_utilizacao_rotativa politica_credito proposito_emp
n 9,578.00 9,578.00 9,578.00 9,578.00 9,578.00 9,578.00 9,578.00 9,578.00 9,578.00 9,578.00 9,578.00 9,578.00 9,578.00 9,578.00
min 0.00 0.00 178.96 0.00 7.55 0.00 15.67 612.00 0.00 0.00 0.06 0.00 NA NA
max 13.00 33.00 17,639.96 29.96 14.53 1.00 940.14 827.00 5.00 1,207,359.00 0.22 119.00 NA NA
median 0.00 1.00 4,139.96 12.66 10.93 0.00 268.95 707.00 0.00 8,596.00 0.12 46.30 NA NA
q1 0.00 0.00 2,820.00 7.21 10.56 0.00 163.77 682.00 0.00 3,187.00 0.10 22.60 NA NA
q3 0.00 2.00 5,730.00 17.95 11.29 0.00 432.76 737.00 0.00 18,249.50 0.14 70.90 NA NA
iqr 0.00 2.00 2,910.00 10.74 0.73 0.00 268.99 55.00 0.00 15,062.50 0.04 48.30 NA NA
mad 0.00 1.48 2,135.07 7.98 0.54 0.00 184.88 37.06 0.00 9,619.11 0.03 35.88 NA NA
mean 0.16 1.58 4,560.77 12.61 10.93 0.16 319.09 710.85 0.06 16,913.96 0.12 46.80 NA NA
sd 0.55 2.20 2,496.93 6.88 0.61 0.37 207.07 37.97 0.26 33,756.19 0.03 29.01 NA NA
se 0.01 0.02 25.51 0.07 0.01 0.00 2.12 0.39 0.00 344.92 0.00 0.30 NA NA
ci 0.01 0.04 50.01 0.14 0.01 0.01 4.15 0.76 0.00 676.11 0.00 0.58 NA NA
var 0.30 4.84 6,234,659.42 47.39 0.38 0.13 42,878.40 1,441.80 0.07 1,139,480,363.32 0.00 841.81 NA NA
range 13.00 33.00 17,461.00 29.96 6.98 1.00 924.47 215.00 5.00 1,207,359.00 0.16 119.00 NA NA
cv 1.82 3.07 1,367.02 3.76 0.03 0.84 134.38 2.03 1.11 67,369.21 0.01 17.99 NA NA
distinct 11.00 28.00 2,687.00 2,529.00 1,987.00 2.00 4,788.00 44.00 6.00 7,869.00 249.00 1,035.00 2.00 7.00
ausente 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00
ausente_pct 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00
zeros 8,458.00 3,637.00 0.00 89.00 0.00 8,045.00 0.00 0.00 9,019.00 321.00 0.00 297.00 NA NA
zeros_pct 0.88 0.38 0.00 0.01 0.00 0.84 0.00 0.00 0.94 0.03 0.00 0.03 NA NA
q.5 0.00 0.00 1,320.04 1.27 9.92 0.00 65.56 657.00 0.00 127.70 0.08 1.10 NA NA
q.95 1.00 5.00 9,329.96 23.65 11.92 1.00 756.27 782.00 1.00 57,654.30 0.17 94.00 NA NA
curtose 74.40 29.27 4.94 2.10 4.61 4.44 3.14 2.58 41.76 262.52 2.78 1.88 NA NA
assimetria 6.06 3.58 1.16 0.02 0.03 1.85 0.91 0.47 5.13 11.16 0.16 0.06 NA NA
Code
# Numeric summary table of the dataset via
# enoqueR::eda_tabela_dataset_resumo_num().
enoqueR::eda_tabela_dataset_resumo_num(dados)
Code
# Plot produced by enoqueR::eda_visual_dataset_hist() for the dataset.
enoqueR::eda_visual_dataset_hist(dados)

Code
# Plot produced by enoqueR::eda_visual_dataset_resposta(), with
# `politica_credito` passed as the response column name.
enoqueR::eda_visual_dataset_resposta(dados, resposta = "politica_credito")

3 Próximos passos

  • Analisar associação/correlação entre as covariáveis